{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8d7e4358",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "05dd7e0e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'1.3.4'"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pd.__version__"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "a1d1f971",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('yellow_tripdata_2021-01.csv', nrows=100)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "f9e76d7f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)\n",
    "df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f84ca922",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sqlalchemy import create_engine"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "id": "339a5c72",
   "metadata": {},
   "outputs": [],
   "source": [
    "engine = create_engine('postgresql://root:root@localhost:5432/ny_taxi')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "791b0846",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "CREATE TABLE yellow_taxi_data (\n",
      "\t\"VendorID\" BIGINT, \n",
      "\ttpep_pickup_datetime TIMESTAMP WITHOUT TIME ZONE, \n",
      "\ttpep_dropoff_datetime TIMESTAMP WITHOUT TIME ZONE, \n",
      "\tpassenger_count BIGINT, \n",
      "\ttrip_distance FLOAT(53), \n",
      "\t\"RatecodeID\" BIGINT, \n",
      "\tstore_and_fwd_flag TEXT, \n",
      "\t\"PULocationID\" BIGINT, \n",
      "\t\"DOLocationID\" BIGINT, \n",
      "\tpayment_type BIGINT, \n",
      "\tfare_amount FLOAT(53), \n",
      "\textra FLOAT(53), \n",
      "\tmta_tax FLOAT(53), \n",
      "\ttip_amount FLOAT(53), \n",
      "\ttolls_amount FLOAT(53), \n",
      "\timprovement_surcharge FLOAT(53), \n",
      "\ttotal_amount FLOAT(53), \n",
      "\tcongestion_surcharge FLOAT(53)\n",
      ")\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(pd.io.sql.get_schema(df, name='yellow_taxi_data', con=engine))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "de7b9007",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_iter = pd.read_csv('yellow_tripdata_2021-01.csv', iterator=True, chunksize=100000)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "80ba9bc5",
   "metadata": {},
   "outputs": [],
   "source": [
    "df = next(df_iter)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "8ec24c0e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "100000"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "3c72593b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)\n",
    "df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b652a771",
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "8252c284",
   "metadata": {},
   "outputs": [],
   "source": [
    "df.head(n=0).to_sql(name='yellow_taxi_data', con=engine, if_exists='replace')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "98959ffe",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 10.4 s\n"
     ]
    }
   ],
   "source": [
    "%time df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "42bb09d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from time import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "09cd464a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "inserted another chunk..., took 16.115 second\n",
      "inserted another chunk..., took 15.076 second\n",
      "inserted another chunk..., took 14.321 second\n",
      "inserted another chunk..., took 17.030 second\n",
      "inserted another chunk..., took 16.002 second\n",
      "inserted another chunk..., took 15.990 second\n",
      "inserted another chunk..., took 16.380 second\n",
      "inserted another chunk..., took 15.642 second\n",
      "inserted another chunk..., took 14.489 second\n",
      "inserted another chunk..., took 14.900 second\n",
      "inserted another chunk..., took 14.711 second\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\alexe\\Anaconda3\\lib\\site-packages\\IPython\\core\\interactiveshell.py:3364: DtypeWarning: Columns (6) have mixed types.Specify dtype option on import or set low_memory=False.\n",
      "  if (await self.run_code(code, result,  async_=asy)):\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "inserted another chunk..., took 15.301 second\n",
      "inserted another chunk..., took 9.552 second\n"
     ]
    },
    {
     "ename": "StopIteration",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mStopIteration\u001b[0m                             Traceback (most recent call last)",
      "\u001b[1;32m~\\AppData\\Local\\Temp/ipykernel_14212/2024965355.py\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m      2\u001b[0m     \u001b[0mt_start\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mtime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      3\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 4\u001b[1;33m     \u001b[0mdf\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf_iter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      6\u001b[0m     \u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtpep_pickup_datetime\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtpep_pickup_datetime\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers\\readers.py\u001b[0m in \u001b[0;36m__next__\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m   1022\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__next__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1023\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1024\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget_chunk\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1025\u001b[0m         \u001b[1;32mexcept\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1026\u001b[0m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers\\readers.py\u001b[0m in \u001b[0;36mget_chunk\u001b[1;34m(self, size)\u001b[0m\n\u001b[0;32m   1072\u001b[0m                 \u001b[1;32mraise\u001b[0m \u001b[0mStopIteration\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1073\u001b[0m             \u001b[0msize\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mmin\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mnrows\u001b[0m \u001b[1;33m-\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_currow\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1074\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msize\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1075\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1076\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__enter__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers\\readers.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m   1045\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1046\u001b[0m         \u001b[0mnrows\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mvalidate_integer\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"nrows\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1047\u001b[1;33m         \u001b[0mindex\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcol_dict\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1048\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1049\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mindex\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\io\\parsers\\c_parser_wrapper.py\u001b[0m in \u001b[0;36mread\u001b[1;34m(self, nrows)\u001b[0m\n\u001b[0;32m    221\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    222\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mlow_memory\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 223\u001b[1;33m                 \u001b[0mchunks\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_low_memory\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mnrows\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    224\u001b[0m                 \u001b[1;31m# destructive to chunks\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    225\u001b[0m                 \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0m_concatenate_chunks\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchunks\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\Anaconda3\\lib\\site-packages\\pandas\\_libs\\parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.read_low_memory\u001b[1;34m()\u001b[0m\n",
      "\u001b[1;31mStopIteration\u001b[0m: "
     ]
    }
   ],
   "source": [
    "while True: \n",
    "    t_start = time()\n",
    "\n",
    "    df = next(df_iter)\n",
    "\n",
    "    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)\n",
    "    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)\n",
    "    \n",
    "    df.to_sql(name='yellow_taxi_data', con=engine, if_exists='append')\n",
    "\n",
    "    t_end = time()\n",
    "\n",
    "    print('inserted another chunk, took %.3f second' % (t_end - t_start))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "c2178a11",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "--2022-01-15 23:57:02--  https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv\n",
      "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.113.61\n",
      "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.113.61|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 12322 (12K) [application/octet-stream]\n",
      "Saving to: 'taxi+_zone_lookup.csv'\n",
      "\n",
      "     0K .......... ..                                         100%  910K=0.01s\n",
      "\n",
      "2022-01-15 23:57:02 (910 KB/s) - 'taxi+_zone_lookup.csv' saved [12322/12322]\n",
      "\n"
     ]
    }
   ],
   "source": [
    "!wget https://s3.amazonaws.com/nyc-tlc/misc/taxi+_zone_lookup.csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "bdefce25",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_zones = pd.read_csv('taxi+_zone_lookup.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "c99e27b5",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>LocationID</th>\n",
       "      <th>Borough</th>\n",
       "      <th>Zone</th>\n",
       "      <th>service_zone</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>EWR</td>\n",
       "      <td>Newark Airport</td>\n",
       "      <td>EWR</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>Queens</td>\n",
       "      <td>Jamaica Bay</td>\n",
       "      <td>Boro Zone</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>Bronx</td>\n",
       "      <td>Allerton/Pelham Gardens</td>\n",
       "      <td>Boro Zone</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>Manhattan</td>\n",
       "      <td>Alphabet City</td>\n",
       "      <td>Yellow Zone</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>Staten Island</td>\n",
       "      <td>Arden Heights</td>\n",
       "      <td>Boro Zone</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   LocationID        Borough                     Zone service_zone\n",
       "0           1            EWR           Newark Airport          EWR\n",
       "1           2         Queens              Jamaica Bay    Boro Zone\n",
       "2           3          Bronx  Allerton/Pelham Gardens    Boro Zone\n",
       "3           4      Manhattan            Alphabet City  Yellow Zone\n",
       "4           5  Staten Island            Arden Heights    Boro Zone"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_zones.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "id": "5134c2e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_zones.to_sql(name='zones', con=engine, if_exists='replace')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ffae4651",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
